library(minfi)
library(limma)
library(missMethyl)

################## KIDNEY CANCER ANALYSIS ############

# read in targets file with sample information
Ktargets<-read.delim("KidneyTargets.txt")
# read data into R
rgSet450 = read.450k.exp(targets = Ktargets)
# swan normalisation
mSet450SWAN = preprocessSWAN(rgSet450)

# read in annotation file
ann450 = read.csv("HumanMethylation450_15017482_v.1.2.csv",header=TRUE,sep=",",skip=7)
# Filter out poor quality probes
detP<-detectionP(rgSet450)
keep<-rowSums(detP<0.01) == ncol(rgSet450)
mSet450SWAN<-mSet450SWAN[keep,]
# Remove X and Y chromosome CpGs
mSet450SWAN = mSet450SWAN[!(featureNames(mSet450SWAN) %in% ann450$IlmnID[ann450$CHR %in% c("X","Y")]),]

# Retrieve beta and m values from MethylSet object
meth<-getMeth(mSet450SWAN)
unmeth<-getUnmeth(mSet450SWAN)
beta<-getBeta(mSet450SWAN)
Mval<-log2((meth+100)/(unmeth+100))
Aval<-0.5*log2(meth+unmeth)

# Set up designK matrix
group<-factor(pData(mSet450SWAN)$Sample_Group,levels=c("normal","Cancer"))
designK<-model.matrix(~group)
# Test for differential methylation
fit<-lmFit(Mval,designK)
fit$Amean <- rowMeans(Mval)
fit<-eBayes(fit,trend=T)

# Determine significant differentially methylated CpGs using FDR and deltaBeta cut-off of 0.1
betaNorm<-beta[,designK[,ncol(designK)]==0]
betaCan<-beta[,designK[,ncol(designK)]==1]
deltaBeta<-rowMeans(betaCan)-rowMeans(betaNorm)

adj.P.Val<-p.adjust(fit$p.value[,2],method="BH")
sigAllDM<- adj.P.Val < 0.05 & abs(deltaBeta) > 0.1

# Test for differential variability
fitvar<-varFit(Mval,designK,coef=2,trend=T,robust=T)

# Determine significant differentially variable CpGs using FDR and logVarRatio cut-off of log(5)
adj.P.ValDV<-p.adjust(fitvar$p.value[,2],method="BH")
sigAllDV<-adj.P.ValDV < 0.05 & abs(fitvar$LogVarRatio) > log(5)

#Annotate fit objects
q<-match(rownames(fit),ann450$IlmnID)
ann<-ann450[q,c("IlmnID","CHR","UCSC_RefGene_Name","UCSC_RefGene_Group","Relation_to_UCSC_CpG_Island","UCSC_CpG_Islands_Name","Chromosome_36","Coordinate_36")]
fit$genes<-ann
fitvar$genes<-ann

#Other tests of differential variability
# F-test

group <- designK[,2]
i.group <- names(table(group))
i.data1 <- Mval[,group==i.group[1]]
i.data2 <- Mval[,group==i.group[2]]
varnorm<-apply(i.data1,1,var)
varcan<-apply(i.data2,1,var)
Fstat<-varcan/varnorm
FPval<-2*pmin(pf(Fstat,df1=ncol(i.data1)-1,df2=ncol(i.data2)-1,lower.tail=FALSE),pf(1/Fstat,df1=ncol(i.data2)-1,df2=ncol(i.data1)-1,lower.tail=FALSE))
adj.FPval<-p.adjust(FPval,method="BH")
Ftest<-data.frame(fit$genes,VarCan=varcan,VarNorm=varnorm,Fstat,FPval,adj.FPval)

## Bartlett test
narrays=ncol(Mval)
varp<-(159*varnorm+282*varcan)/(narrays-2)
top<-(narrays-2)*log(varp) - (282*log(varcan)+159*log(varnorm))
bottom<-1+(1/159+1/282-1/(narrays-2))/3
Bstat<-top/bottom
BartPval<-pchisq(Bstat,df=1,lower.tail=FALSE)
adj.BPval<-p.adjust(BartPval,method="BH")
Btest<-data.frame(fit$genes,Bstat=Bstat,BPval=BartPval,adj.BPval)

# Island-level analysis: Kidney
island<-fit$genes$Relation_to_UCSC_CpG_Island=="Island"
MvalIslandK<-Mval[island,]
island_name<-factor(as.character(fit$genes$UCSC_CpG_Islands_Name[island]))

# Averaging the CpG sites M values in each CpG island
newKMval<-matrix(0,nrow=length(levels(island_name)),ncol=ncol(Mval))
for(i in 1:ncol(newKMval)) newKMval[,i]<-tapply(MvalIslandK[,i],island_name,mean)
 
# Differential methylation analysis 
fitKI<-lmFit(newKMval,designK)
fitKI<-eBayes(fitKI)
fitKI$genes$Island_name<-levels(island_name)
fitKI$genes<-as.data.frame(fitKI$genes)

# Differential variability analysis
fitvarKI<-varFit(newKMval,designK,coef=ncol(designK))
fitvarKI$genes$Island_name<-levels(island_name)
fitvarKI$genes<-as.data.frame(fitvarKI$genes)

# Annotate fit objects
ann450$UCSC_CpG_Islands_Name<-as.character(ann450$UCSC_CpG_Islands_Name)
q<-match(fitvarKI$genes$Island_name,ann450$UCSC_CpG_Islands_Name)
ann<-ann450[q,c("IlmnID","CHR","UCSC_RefGene_Name","UCSC_RefGene_Group","Relation_to_UCSC_CpG_Island","UCSC_CpG_Islands_Name","Chromosome_36","Coordinate_36")]
fitvarKI$genes<-ann
fitKI$genes<-ann

################## LUNG CANCER ANALYSIS ############

# read in targets file with sample information
Ltargets<-read.delim("LungTargets.txt")
# Read data into R
rgSetLung = read.450k.exp(targets = Ltargets)
# swan normalisation
mSetLung = preprocessSWAN(rgSetLung)

# Filter out poor quality probes
detP<-detectionP(rgSetLung)
keep<-rowSums(detP<0.01) == ncol(rgSetLung)
mSetLung<-mSetLung[keep,]
# Filter out X and Y chromosomes
mSetLung = mSetLung[!(featureNames(mSetLung) %in% ann450$IlmnID[ann450$CHR %in% c("X","Y")]),]
mdsPlot(mSetLung,sampNames= pData(mSetLung)$HID,sampGroups = pData(mSetLung)$group)

# Extract group information
group<-as.character(pData(mSetLung)$group)
group[pData(mSetLung)$group=="Solid Tissue Normal"]<-"normal"
group[pData(mSetLung)$group=="Primary solid Tumor"]<-"cancer"
group<-factor(group,levels=c("normal","cancer"))

# Extract M values and Beta values from MethylSet object
methL<-getMeth(mSetLung)
unmethL<-getUnmeth(mSetLung)
MvalLung<-log2((methL+100)/(unmethL+100))
betaL<-getBeta(mSetLung)

# Set up design matrix
designL<-model.matrix(~group)
# Test for differential methylation
fitLM<-lmFit(MvalLung,designL)
fitLM$Amean <- rowMeans(MvalLung)
fitLM<-eBayes(fitLM,trend=T)

# Determine significant differentially methylated CpGs with FDR and deltaBeta cut-off of 0.1
betaNormL<-betaL[,designL[,ncol(designL)]==0]
betaCanL<-betaL[,designL[,ncol(designL)]==1]
deltaBetaL<-rowMeans(betaCanL)-rowMeans(betaNormL)

adj.P.Val<-p.adjust(fitLM$p.value[,2],method="BH")
sigAllDML<- adj.P.Val < 0.05 & abs(deltaBetaL) > 0.1

# Test for differential variability
fitvarL<-varFit(MvalLung,designL,coef=ncol(designL))

# Determine significant differentially variable CpGs with FDR and logVarRatio cut-off of log(5)
adj.P.Val <- p.adjust(fitvarL$p.value[,2],method="BH")
sigAllDVL <- adj.P.Val < 0.05 & abs(fitvarL$LogVarRatio) > log(5)

#Annotate fit objects
q<-match(fitLM$genes[,1],ann450$IlmnID)
ann<-ann450[q,c("IlmnID","CHR","UCSC_RefGene_Name","UCSC_RefGene_Group","Relation_to_UCSC_CpG_Island","UCSC_CpG_Islands_Name","Chromosome_36","Coordinate_36")]
fitLM$genes<-ann
fitvarL$genes<-ann

# Island-level analysis: Lung

island<-fitLM$genes$Relation_to_UCSC_CpG_Island=="Island"
MvalIslandL<-MvalLung[island,]
island_name<-factor(as.character(fitLM$genes$UCSC_CpG_Islands_Name[island]))

# Averaging the CpG sites M values in each CpG island
newLMval<-matrix(0,nrow=length(levels(island_name)),ncol=ncol(MvalLung))
for(i in 1:ncol(newLMval)) newLMval[,i]<-tapply(MvalIslandL[,i],island_name,mean)

# Differential methylation analysis 
fitLI<-lmFit(newLMval,designL)
fitLI<-eBayes(fitLI)
fitLI$genes$Island_name<-levels(island_name)
fitLI$genes<-as.data.frame(fitLI$genes)

# Differential variability analysis
fitvarLI<-varFit(newLMval,designL,coef=ncol(designL))
fitvarLI$genes$Island_name<-levels(island_name)
fitvarLI$genes<-as.data.frame(fitvarLI$genes)

# Annotate fit objects
q<-match(fitvarLI$genes$Island_name,ann450$UCSC_CpG_Islands_Name)
ann<-ann450[q,c("IlmnID","CHR","UCSC_RefGene_Name","UCSC_RefGene_Group","Relation_to_UCSC_CpG_Island","UCSC_CpG_Islands_Name","Chromosome_36","Coordinate_36")]
fitvarLI$genes<-ann
fitLI$genes<-ann

############## PROSTATE CANCER ANALYSIS #################

# read in targets file with sample information
Ptargets<-read.delim("ProstateTargets.txt")
# Read data into R
rgSetProstate = read.450k.exp(targets = targetsProstate)
# swan normalisation
mSetProstate = preprocessSWAN(rgSetProstate)

# Filter out poor quality probes
detP<-detectionP(rgSetProstate)
keep<-rowSums(detP<0.01) == ncol(rgSetProstate)
mSetProstate<-mSetProstate[keep,]
# Filter out X and Y chromosome CpGs
mSetProstate = mSetProstate[!(featureNames(mSetProstate) %in% ann450$IlmnID[ann450$CHR %in% c("X","Y")]),]
mdsPlot(mSetProstate,sampNames= pData(mSetProstate)$HID,sampGroups = pData(mSetProstate)$group)

#Extract group information
group<-as.character(pData(mSetProstate)$group)
group[pData(mSetProstate)$group=="Solid Tissue Normal"]<-"normal"
group[pData(mSetProstate)$group=="Primary solid Tumor"]<-"cancer"
group<-factor(group,levels=c("normal","cancer"))

# Extract M values and Beta values from MethylSet object
methP<-getMeth(mSetProstate)
unmethP<-getUnmeth(mSetProstate)
MvalProstate<-log2((methP+100)/(unmethP+100))
betaP<-getBeta(mSetProstate)

# Test for differential methylation
designP<-model.matrix(~group)
fitPM<-lmFit(MvalProstate,designP)
fitPM$Amean <- rowMeans(MvalProstate)
fitPM<-eBayes(fitPM,trend=T)

# Determine significant differentially methylated CpGs using FDR and deltaBeta cut-off of 0.1
betaNormP<-betaP[,designP[,ncol(designP)]==0]
betaCanP<-betaP[,designP[,ncol(designP)]==1]
deltaBetaP<-rowMeans(betaCanP)-rowMeans(betaNormP)

adj.P.Val<-p.adjust(fitPM$p.value[,2],method="BH")
sigAllDMP<- adj.P.Val < 0.05 & abs(deltaBetaP) > 0.1

# Test for differential variability
fitvarP<-varFit(MvalProstate,designP,coef=ncol(designP))

# Determine significant differentially variable CpGs using FDR and logVarRatio cut-off of log(5)
adj.P.Val<-p.adjust(fitvarP$p.value[,2],method="BH")
sigAllDVP<-adj.P.Val < 0.05 & abs(fitvarP$LogVarRatio) > log(5)

#Annotate fit objects
q<-match(fitPM$genes[,1],ann450$IlmnID)
ann<-ann450[q,c("IlmnID","CHR","UCSC_RefGene_Name","UCSC_RefGene_Group","Relation_to_UCSC_CpG_Island","UCSC_CpG_Islands_Name","Chromosome_36","Coordinate_36")]
fitPM$genes<-ann
fitvarP$genes<-ann


# Island-level analysis: Prostate
island<-fitPM$genes$Relation_to_UCSC_CpG_Island=="Island"
MvalIslandP<-MvalProstate[island,]
island_name<-factor(as.character(fitPM$genes$UCSC_CpG_Islands_Name[island]))

# Averaging the CpG sites M values in each CpG island
newPMval<-matrix(0,nrow=length(levels(island_name)),ncol=ncol(MvalProstate))
for(i in 1:ncol(newPMval)) newPMval[,i]<-tapply(MvalIslandP[,i],island_name,mean)

# Differential methylation analysis 
fitPI<-lmFit(newPMval,designP)
fitPI<-eBayes(fitPI)
fitPI$genes$Island_name<-levels(island_name)
fitPI$genes<-as.data.frame(fitPI$genes)

# Differential variability analysis
fitvarPI<-varFit(newPMval,designP,coef=ncol(designP))
fitvarPI$genes$Island_name<-levels(island_name)
fitvarPI$genes<-as.data.frame(fitvarPI$genes)

# Annotate fit objects
q<-match(fitvarPI$genes$Island_name,ann450$UCSC_CpG_Islands_Name)
ann<-ann450[q,c("IlmnID","CHR","UCSC_RefGene_Name","UCSC_RefGene_Group","Relation_to_UCSC_CpG_Island","UCSC_CpG_Islands_Name","Chromosome_36","Coordinate_36")]
fitvarPI$genes<-ann
fitPI$genes<-ann


#################### UTERINE CANCER ANALYSIS #################

# read in targets file with sample information
targetsUC<-read.delim("UCECtargets.txt")
# Read data into R
rgSetUCEC = read.450k.exp(targets = targetsUC)
# swan normalisation
mSetUCEC = preprocessSWAN(rgSetUCEC)

# Filter out poor quality probes
detP<-detectionP(rgSetUCEC)
keep<-rowSums(detP<0.01) == ncol(rgSetUCEC)
mSetUCEC<-mSetUCEC[keep,]
# Filter out X and Y chromosome CpGs
mSetUCEC = mSetUCEC[!(featureNames(mSetUCEC) %in% ann450$IlmnID[ann450$CHR %in% c("X","Y")]),]
mdsPlot(mSetUCEC,sampNames= pData(mSetUCEC)$HID,sampGroups = pData(mSetUCEC)$group)

#Extract group information
group<-as.character(pData(mSetUCEC)$group)
group[pData(mSetUCEC)$group=="Solid Tissue Normal"]<-"normal"
group[pData(mSetUCEC)$group=="Primary solid Tumor"]<-"cancer"
group<-factor(group,levels=c("normal","cancer"))

designUC<-model.matrix(~group)

# Extract M values and Beta values from methylSet object
methUC<-getMeth(mSetUCEC)
unmethUC<-getUnmeth(mSetUCEC)
betaUC<-getBeta(mSetUCEC)
MvalUC<-log2((methUC+100)/(unmethUC+100))

# Differential methylation analysis
fitUM<-lmFit(MvalUC,designUC)
fitUM$Amean <- rowMeans(MvalUC)
fitUM<-eBayes(fitUM,trend=T,robust=T)

# Identify significant differentially methylated CpGs using FDR and deltaBeta cut-off of 0.1
betaNormUC<-betaUC[,designUC[,ncol(designUC)]==0]
betaCanUC<-betaUC[,designUC[,ncol(designUC)]==1]
deltaBetaUC<-rowMeans(betaCanUC)-rowMeans(betaNormUC)

adj.P.Val<-p.adjust(fitUM$p.value[,2],method="BH")
sigAllDMUC<- adj.P.Val < 0.05 & abs(deltaBetaUC) > 0.1

# Differential variability analysis
fitvarUC<-varFit(MvalUC,designUC,coef=ncol(designUC),trend=T,robust=T)

# Identify significant differentially variable CpGs using FDR and logVarRatio cut-off og log(5)
adj.P.Val<-p.adjust(fitvarUC$p.value[,2],method="BH")
sigAllDVUC<-adj.P.Val < 0.05 & abs(fitvarUC$LogVarRatio) > log(5)

#Annotate fit objects
q<-match(rownames(fitUM),ann450$IlmnID)
ann<-ann450[q,c("IlmnID","CHR","UCSC_RefGene_Name","UCSC_RefGene_Group","Relation_to_UCSC_CpG_Island","UCSC_CpG_Islands_Name","Chromosome_36","Coordinate_36")]
fitUM$genes<-ann
fitvarUC$genes<-ann

################### AGING DATA ANALYSIS ######################

# read in targets file with sample information
targets<-read.delim("targetsAge.txt",header=T)
# Read data into R
rgSetAge<-read.450k.exp(targets=targets)
# swan normalisation
mSetAge = preprocessSWAN(rgSetAge)

# Filter out poor quality probes
detP<-detectionP(rgSetAge)
keep<-rowSums(detP<0.01) == ncol(rgSetAge)
mSetAge<-mSetAge[keep,]

# Set up design matrix
group<-factor(targets$Sample_Group,levels=c("NewBorns","OLD"))
design<-model.matrix(~group)

# Extract M values and Beta values from methylSet object
meth<-getMeth(mSetAge)
unmeth<-getUnmeth(mSetAge)
beta<-getBeta(mSetAge)
Mval<-log2((meth+100)/(unmeth+100))

# Differential methylation
fit<-lmFit(Mval,design)
fit$Amean<-rowMeans(Mval)
fit<-eBayes(fit,trend=T,robust=T)

# Determine significant differentially methylated CpGs using FDR and deltaBeta cut-off of 0.1
adj.P.Val<-p.adjust(fit$p.value[,2],method="BH")
betaNB<-beta[,design[,2]==0]
betaOLD<-beta[,design[,2]==1]
deltaBeta<-rowMeans(betaOLD)-rowMeans(betaNB)
sigDM<-adj.P.Val < 0.05 & abs(deltaBeta) > 0.1

## Differential variability
fitvar<-varFit(Mval,design,coef=2,trend=T,robust=T)

adj.P.Val<-p.adjust(fitvar$p.value[,2],method="BH")
sigDV<-adj.P.Val < 0.05 & abs(fitvar$LogVarRatio) > log(5)
# Separate into more variable in old vs newborn (up) and more variable in newborn vs old (dn)
upDV<-adj.P.Val < 0.05 & fitvar$LogVarRatio > log(5)
dnDV<-adj.P.Val < 0.05 & fitvar$LogVarRatio < -log(5)

# Annotate fit objects
q<-match(rownames(fit),ann450$IlmnID)
ann<-ann450[q,c("IlmnID","CHR","UCSC_RefGene_Name","UCSC_RefGene_Group","Relation_to_UCSC_CpG_Island","UCSC_CpG_Islands_Name","Chromosome_36","Coordinate_36")]
fit$genes<-ann
fitvar$genes<-ann

# GOstats analysis 
library(GO.db)
library(GOstats)
library(org.Hs.eg.db)

# Define significant DV genes
topgenes_updv<-as.character(fitvar$genes$UCSC_RefGene_Name[upDV])
topgenes_dndv<-as.character(fitvar$genes$UCSC_RefGene_Name[dnDV])
topgenes_updv<-gsub(";.*","",topgenes_updv)
topgenes_updv<-unique(topgenes_updv)

# Get Entrez gene IDs
symbolEG<-toTable(org.Hs.egSYMBOL2EG)
m<-match(topgenes_updv,symbolEG[,2])
topEG_updv <- symbolEG[m[!is.na(m)],1]

topgenes_dndv<-gsub(";.*","",topgenes_dndv)
topgenes_dndv<-unique(topgenes_dndv)
length(topgenes_dndv)
symbolEG<-toTable(org.Hs.egSYMBOL2EG)
m<-match(topgenes_dndv,symbolEG[,2])
topEG_dndv <- symbolEG[m[!is.na(m)],1]

# Define gene universe
allgenes<-as.character(fitvar$genes$UCSC_RefGene_Name)
allgenes<-gsub(";.*","",allgenes)
allgenes<-unique(allgenes)
m<-match(allgenes,symbolEG[,2])
allEG <- symbolEG[m[!is.na(m)],1]

# Get GO terms
GOterms<-toTable(org.Hs.egGO)

# remove genes with no GO category
uniqueG <-unique(GOterms$gene_id)
m<-match(allEG,uniqueG)
allEG<-allEG[!is.na(m)]
m<-match(topEG_updv,uniqueG)
topEG_updv<-topEG_updv[!is.na(m)]
m<-match(topEG_dndv,uniqueG)
topEG_dndv<-topEG_dndv[!is.na(m)]

cutoff = 0.0001
BPparams_updv <- new("GOHyperGParams", geneIds = topEG_updv, universeGeneIds = allEG, annotation = "org.Hs.eg.db", ontology = "BP",pvalueCutoff=cutoff,conditional=FALSE,testDirection="over")

CCparams_updv <- new("GOHyperGParams", geneIds = topEG_updv, universeGeneIds = allEG, annotation = "org.Hs.eg.db", ontology = "CC",pvalueCutoff=cutoff,conditional=FALSE,testDirection="over")

MFparams_updv<- new("GOHyperGParams", geneIds = topEG_updv, universeGeneIds = allEG, annotation = "org.Hs.eg.db", ontology = "MF",pvalueCutoff=cutoff,conditional=FALSE,testDirection="over")

BPparams_dndv <- new("GOHyperGParams", geneIds = topEG_dndv, universeGeneIds = allEG, annotation = "org.Hs.eg.db", ontology = "BP",pvalueCutoff=cutoff,conditional=FALSE,testDirection="over")

CCparams_dndv <- new("GOHyperGParams", geneIds = topEG_dndv, universeGeneIds = allEG, annotation = "org.Hs.eg.db", ontology = "CC",pvalueCutoff=cutoff,conditional=FALSE,testDirection="over")

MFparams_dndv<- new("GOHyperGParams", geneIds = topEG_dndv, universeGeneIds = allEG, annotation = "org.Hs.eg.db", ontology = "MF",pvalueCutoff=cutoff,conditional=FALSE,testDirection="over")

# Perform GOstats test

BPhgOver_updv <- hyperGTest(BPparams_updv)
CChgOver_updv <- hyperGTest(CCparams_updv)
MFhgOver_updv <- hyperGTest(MFparams_updv)

BPhgOver_dndv <- hyperGTest(BPparams_dndv)
CChgOver_dndv <- hyperGTest(CCparams_dndv)
MFhgOver_dndv <- hyperGTest(MFparams_dndv)

# save html report

htmlReport(BPhgOver_updv, file = "GO_results_Age_updv.html", label="Biological Process Results - Old var", digits=10)
htmlReport(CChgOver_updv, file = "GO_results_Age_updv.html", label="Cellular Component results - Old var", digits=10,append=T)
htmlReport(MFhgOver_updv, file = "GO_results_Age_updv.html", label="Molecular Function results - Old var", digits=10,append=T)

htmlReport(BPhgOver_dndv, file = "GO_results_Age_dndv.html", label="Biological Process Results - Young var", digits=10)
htmlReport(CChgOver_dndv, file = "GO_results_Age_dndv.html", label="Cellular Component results - Young var", digits=10,append=T)
htmlReport(MFhgOver_dndv, file = "GO_results_Age_dndv.html", label="Molecular Function results - Young var", digits=10,append=T)
